BiocManager::install("rGREAT")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.3 (2025-02-28)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'rGREAT'
## Old packages: 'data.table', 'generics', 'softImpute', 'zip'
suppressPackageStartupMessages({
  library(GenomicRanges)
  library(epiwraps)
  library(ggplot2)
  library(rGREAT) # Gene Ontology enrichment among genomic regions
  library(MotifDb)
  library(universalmotif)
  library(ggplot2)
  library(SummarizedExperiment) # data structure
  library(sechm) # for plotting heatmaps from a SummrizedExperiment
  library(BiocParallel) # for multithreading
  library(chromVAR) # for motif accessibility estimation
  library(limma) # for statistical analysis
  library(TFBSTools)
  library(Biostrings)
})
## See system.file("LICENSE", package="MotifDb") for use restrictions.
# to control multithreading, unix users can use:
register(MulticoreParam(4))
# for windows users, rather one of the following:
# register(SerialParam()) # this will disable multi-threading
# register(SnowParam(2))
system("which gfortran")
system("gfortran --version")
Sys.getenv("PATH")
## [1] "/Library/Frameworks/Python.framework/Versions/3.12/bin:/Library/Frameworks/Python.framework/Versions/3.10/bin:/Library/Frameworks/Python.framework/Versions/3.7/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/opt/X11/bin:/Library/TeX/texbin:/Users/florencemarti/Downloads/sratoolkit.3.2.1-mac-x86_64/bin:/Applications/quarto/bin:/usr/texbin:/Applications/RStudio.app/Contents/Resources/app/quarto/bin:/Applications/RStudio.app/Contents/Resources/app/bin/postback"

#1. Analysis ##1.a Download the data

options(timeout=6000)
download.file("https://ethz-ins.org/content/w10.assignment.zip", "archive.zip", mode="wb")
unzip("archive.zip")
list.files()
peaks <- list.files(pattern="bed$")
# we first import the peaks
peaks <- lapply(peaks, rtracklayer::import.bed)
# we'll focus on the high-quality peaks
peaks <- lapply(peaks, FUN=function(x) x[x$score>800])
# we get the union of non-redundant regions
regions <- reduce(unlist(GRangesList(peaks)))
tracks <- list.files(pattern = "bw$")
creb_bed_files <- grep("creb", peaks, value = TRUE, ignore.case = TRUE)
creb_bw_files <- grep("creb", tracks, value = TRUE, ignore.case = TRUE)

# Create a subset of peaks only for CREB-related BED files
creb_peaks <- peaks[peaks %in% creb_bed_files]

# Create a subset of tracks only for CREB-related BigWig files
creb_tracks <- tracks[tracks %in% creb_bw_files]

1.b Plot

ese <- signal2Matrix(creb_tracks, regions, extend=2000)
## Reading Creb1.bw
## Reading Creb3.bw
## Reading Creb3L1.bw
plotEnrichedHeatmaps(ese)

ese2 <- ese[1:1000,]
plotEnrichedHeatmaps(ese2, cluster_rows = TRUE, show_row_dend=TRUE )

1.c Clustering

Use clustering and visualization to illustrate the relationship between the binding of the different proteins

cl2 <- clusterSignalMatrices(ese, k=2:10)
ggplot(cl2$varExplained, aes(k, varExplained)) + geom_line()

set.seed(123)  # to ensure that it gives the same results everytime
cl <- clusterSignalMatrices(ese, k=7)
##   ~79% of the variance explained by clusters
table(cl)
## cl
##   1   2   3   4   5   6   7 
## 405  92 212 495 507 247 311
head(cl)
## [1] 5 4 1 1 1 4
## Levels: 1 2 3 4 5 6 7
length(cl)
## [1] 2269
length(regions)
## [1] 2269
# to make sure the cluster labels stay associated with the corresponding regions/rows
# even if we manipulate the object, put them inside the rowData of the object:
rowData(ese)$cluster <- cl
head(rowData(ese))
## DataFrame with 6 rows and 1 column
##                     cluster
##                    <factor>
## chr1:778487-779069        5
## chr1:904436-904985        4
## chr1:941785-942096        1
## chr1:943126-943501        1
## chr1:959088-959463        1
## chr1:961148-961904        4
mycolors <- c("1"="pink", "2"="lightblue", "3"="darkgreen", "4"="orange", "5"="purple", "6"="blue", "7"="lightgreen")
plotEnrichedHeatmaps(ese, row_split="cluster", mean_color=mycolors, colors=c("white","darkred"))

CREB1 and CREB3L1 have similar binding patterns, even though CREB3L1 is closely related to CREB3.

d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line(size=1.2) + facet_wrap(~split)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

1.d Clustering using relative signal instead:

cl <- clusterSignalMatrices(ese, k=7, scaleRows = TRUE)
##   ~94% of the variance explained by clusters
d <- meltSignals(ese, splitBy=cl)
ggplot(d, aes(position, mean, colour=sample)) + geom_line() + facet_wrap(~split)

plotEnrichedHeatmaps(ese, row_split = cl, scale_rows = "global")

2. Enrichment Analyis

Cluster 3

# we first split the regions by cluster:
split_regions <- split(rowRanges(ese), rowData(ese)$cluster)
lengths(split_regions)
##   1   2   3   4   5   6   7 
## 405  92 212 495 507 247 311
res <- great(split_regions[["3"]], gene_sets="GO:BP", tss_source="hg38", 
             background=regions, cores=2)
## * TSS source: TxDb.
## * check whether TxDb package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is installed.
## * gene ID type in the extended TSS is 'Entrez Gene ID'.
## * restrict chromosomes to 'chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10,
##     chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX,
##     chrY, chrM'.
## * 18110/30830 protein-coding genes left.
## * update seqinfo to the selected chromosomes.
## * TSS extension mode is 'basalPlusExt'.
## * construct the basal domains by extending 5000bp to upstream and 1000bp to downsteram of TSS.
## * calculate distances to neighbour regions.
## * extend to both sides until reaching the neighbour genes or to the maximal extension.
## * use GO:BP ontology with 15413 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 212 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 2480 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
bp <- getEnrichmentTables(res)
head(bp)
##           id                      description genome_fraction
## 1 GO:0046649            lymphocyte activation       0.1141667
## 2 GO:0050896             response to stimulus       0.6333121
## 3 GO:0051716    cellular response to stimulus       0.5674445
## 4 GO:0044237       cellular metabolic process       0.7306017
## 5 GO:0045321             leukocyte activation       0.1249278
## 6 GO:0032501 multicellular organismal process       0.5649184
##   observed_region_hits fold_enrichment      p_value   p_adjust mean_tss_dist
## 1                   45        1.859247 3.042332e-05 0.02664705        116862
## 2                  160        1.191698 1.088204e-04 0.02664705        112782
## 3                  147        1.221963 1.118256e-04 0.02664705        121534
## 4                  178        1.149221 1.198092e-04 0.02664705        108666
## 5                   46        1.736853 1.259111e-04 0.02664705        116214
## 6                  146        1.219077 1.475089e-04 0.02664705        116126
##   observed_gene_hits gene_set_size fold_enrichment_hyper p_value_hyper
## 1                 24            72              1.819499   0.001248413
## 2                137           642              1.164820   0.004252981
## 3                117           542              1.178310   0.007385788
## 4                143           713              1.094762   0.048972072
## 5                 25            82              1.664176   0.004049256
## 6                109           537              1.107963   0.074782394
##   p_adjust_hyper
## 1      0.1024531
## 2      0.1545510
## 3      0.1683686
## 4      0.2517716
## 5      0.1545510
## 6      0.2916795
res_cc <- great(split_regions[["3"]], gene_sets="GO:CC", tss_source="hg38", 
             background=regions, cores=2)
## * TSS source: TxDb.
## * extended_tss is already cached, directly use it.
## * use GO:CC ontology with 1999 gene sets (source: org.Hs.eg.db).
## * check gene ID type in `gene_sets` and in `extended_tss`.
## * use self-defined background regions.
## * remove excluded regions from background.
## * overlap `gr` to background regions (based on midpoint).
## * in total 212 `gr`.
## * overlap extended TSS to background regions.
## * check which genes are in the gene sets.
## * only take gene sets with size >= 5.
## * in total 381 gene sets.
## * overlap `gr` to every extended TSS.
## * perform binomial test for each biological term.
cc <- getEnrichmentTables(res_cc)
head(cc)
##           id                              description genome_fraction
## 1 GO:0071944                           cell periphery       0.4400013
## 2 GO:0005886                          plasma membrane       0.3999239
## 3 GO:0043226                                organelle       0.9016456
## 4 GO:0043231 intracellular membrane-bounded organelle       0.8534896
## 5 GO:0005654                              nucleoplasm       0.4206354
## 6 GO:0043229                  intracellular organelle       0.8843347
##   observed_region_hits fold_enrichment      p_value   p_adjust mean_tss_dist
## 1                  120        1.286446 0.0001521473 0.01671530        123010
## 2                  111        1.309211 0.0001803972 0.01671530        113750
## 3                  205        1.072463 0.0002639258 0.01671530        114691
## 4                  197        1.088760 0.0005360609 0.02546289        114879
## 5                  112        1.255961 0.0010174977 0.02919191        101635
## 6                  201        1.072120 0.0011372293 0.02919191        114567
##   observed_gene_hits gene_set_size fold_enrichment_hyper p_value_hyper
## 1                 93           412              1.232137   0.005291146
## 2                 87           371              1.280025   0.002119558
## 3                196          1036              1.032689   0.180029741
## 4                175           912              1.047409   0.137611072
## 5                 75           342              1.197039   0.029464253
## 6                184           985              1.019659   0.321605073
##   p_adjust_hyper
## 1     0.11152344
## 2     0.08054319
## 3     0.52001858
## 4     0.52001858
## 5     0.27862703
## 6     0.61104964

3. Plotting of BP GO Terms for Cluster 3:

ggplot(head(bp,10), aes(fold_enrichment, reorder(description, p_adjust), 
                        size=observed_region_hits, color=-log10(p_adjust))) + 
  geom_point() + scale_color_viridis_c() 

ggplot(head(cc,10), aes(fold_enrichment, reorder(description, p_adjust), 
                        size=observed_region_hits, color=-log10(p_adjust))) + 
  geom_point() + scale_color_viridis_c() 

Write a paragraph describing your results:

The signal profiles in cluster 3 for the three transcription factors show distinct binding dynamics. CREB1 has the strongest enrichment peak, closely followed by CREB3L1. In contrast, CREB3 shows minimal binding activity. This suggests that CREB1 and CREB3L1 may play key roles in regulating the GO terms associated with cluster 3.

Looking at the GO:BP results for cluster 3, several highly enriched terms are related to synaptic vesicle dynamics, including synaptic vesicle recycling, synaptic vesicle endocytosis, and presynaptic endocytosis. More general terms like vesicle localisation also show up, suggesting a role in signalling processes.

Since signalling isn’t something I usually work with, I also checked the GO:CC terms to get a better sense of the cellular context. Among the top enriched terms were extrinsic component of membrane, cell cortex, and cell-substrate junction, along with mitochondrion and endoplasmic reticulum. These components line up well with the biological processes and support the idea that CREB1 and CREB3L1 might be involved in regulating endocytosis and vesicle trafficking in a signalling context.